Yeol Ye
University of Chicago
ziyuye@uchicago.edu
import prep
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
from scipy.stats import norm, lognorm
import statsmodels.api as sm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, timedelta
import os
import sys
import warnings
sys.path.append('../code/')
warnings.filterwarnings('ignore')
# --- Data loading and target construction --------------------------------
# Project postings for 2012-2013; one row per posted project.
data_file = 'projects_2012_2013.csv'
df = pd.read_csv('../data/source/projects_2012_2013.csv')

target_name = 'fully_funded'
# Columns treated as continuous numeric features downstream.
num_list = ['school_latitude', 'school_longitude',
            'total_price_including_optional_support', 'students_reached']

# Parse the two date columns so timedelta arithmetic works.
df['date_posted'] = pd.to_datetime(df['date_posted'])
df['datefullyfunded'] = pd.to_datetime(df['datefullyfunded'])

# Label definition: a project counts as fully funded if the funding date
# falls within 60 days of the posting date.
df['fully_funded'] = (df['datefullyfunded'] - df['date_posted']) < timedelta(days=60)

# NOTE(fix): np.float was deprecated in NumPy 1.20 and removed in 1.24; it
# was only an alias for the builtin float, so use float directly.
df = df.astype({'school_latitude': float, 'school_longitude': float, 'fully_funded': float,
                'total_price_including_optional_support': float, 'students_reached': float})

target, features = prep.target_features_split(target_name, df)
# NOTE(fix): _num_cat_split was called unqualified (NameError); it appears
# to live in the prep module next to target_features_split -- TODO confirm.
cat, numeric = prep._num_cat_split(features, num_list)
df.info()

# NOTE(fix): the original copied `df_cleaned[:]`, but df_cleaned is only
# defined further down; copy df instead so the cells run in order.
# __df keeps a version of the frame without the raw date columns.
__df = df.copy()
__df = __df.drop('date_posted', axis=1)
__df = __df.drop('datefullyfunded', axis=1)
import seaborn as sns
import matplotlib
sns.set(style='whitegrid')
%matplotlib inline
import explore
%config InlineBackend.figure_format = 'retina'
explore.count_plot(target_name, target, 'Count Plot for the Target')
explore.count_plot('grade_level', features, 'Count Plot for Grade Levels')
explore.count_plot('school_charter', features, 'Count Plot for Grade Levels')
explore.count_plot('school_magnet', features, 'Count Plot for Grade Levels')
explore.count_plot('teacher_prefix', features, 'Count Plot for Grade Levels')
explore.count_plot('school_metro', features, 'Count Plot for Grade Levels')
plt.rcParams['figure.figsize'] = (14.0, 5.0)
explore.count_plot('primary_focus_area', features, 'Count Plot for Grade Levels')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
explore.count_plot('resource_type', features, 'Count Plot for Grade Levels')
explore.count_plot('poverty_level', features, 'Count Plot for Grade Levels')
explore.box_plot(numeric)
Given the plots above, we can see that there are some outliers in the data, so it would be useful to filter them out and examine the distribution of the remaining data. The following distribution function uses only the data between the 0.05 and 0.95 quantiles.
# Per-feature distributions over the 5th-95th percentile range
# (trimming is handled inside explore.dist_plot).
explore.dist_plot(numeric)
# Correlation heatmap across the whole frame, to spot redundant features.
explore.corr_plot(df)
In this sample, no feature has a high correlation with any other, so we do not need to drop features this time. : )
# Pair plot of every feature against the target to eyeball class separability.
pair_plot_columns = list(features.columns) + [target_name]
sorted_data = df[pair_plot_columns]
explore.pair_plot(sorted_data, target_name)
From the plots, we can see that the two classes of the target value seem to be somewhat separated by some of the features, e.g. Total Price Including Optional Support. This implies the potential feasibility of using a machine learning algorithm to separate the data by label. Note that it is also possible to transform categorical data into numeric (i.e. dummy) data and generate the pair plot. That step is skipped here due to limited space for images; the code part of this project provides such a method.
# --- Feature cleaning ----------------------------------------------------
import feature

# Identifier and free-text geography columns carry no generalizable signal,
# so they are removed before modeling.
list_to_drop = [
    'projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
    'school_city', 'school_district', 'school_state', 'school_county',
]
df_cleaned = df.drop(columns=list_to_drop)
By inspecting the distribution of the numeric features, I decided not to cut outliers, as they are very few and represent meaningful information. Although this step is skipped here, you can find the relevant function in the code section of the project.
By inspecting the content of the numeric features, I decided that they should be treated as continuous rather than discrete variables, so discretization is also skipped here; the relevant function is likewise available in the code section of the project.
# One-hot encode every categorical column of the cleaned frame.
df_cleaned = feature.one_hot_encoding_all(df_cleaned)
df_cleaned.info()

# Columns that still contain missing values after encoding.
null_vars = list(df_cleaned.columns[df_cleaned.isnull().any()])
null_vars

# NOTE(fix): fill_null was called unqualified (NameError); it appears to
# live in the feature module next to one_hot_encoding_all -- TODO confirm.
df_cleaned = feature.fill_null(df_cleaned)
df_cleaned.columns
import model
import graphviz
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import export_graphviz
from sklearn.metrics import *
# --- Models and hyper-parameter grids ------------------------------------
# Short codes identifying each classifier family in the sweep.
models = ['LR', 'KNN', 'DT', 'RF', 'AB', 'BAG']

# Unfitted base estimators, keyed by model code.
clfs = {'LR': LogisticRegression(),
        'KNN': KNeighborsClassifier(),
        'DT': DecisionTreeClassifier(),
        'RF': RandomForestClassifier(),
        'AB': AdaBoostClassifier(),
        'BAG': BaggingClassifier()}

# Hyper-parameter grid per model code, expanded downstream via ParameterGrid.
# NOTE(fix): scikit-learn's default lbfgs solver rejects penalty='l1';
# pin solver='liblinear', which supports both l1 and l2 penalties.
grid = {'LR': {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10],
               'solver': ['liblinear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50],
               'min_samples_split': [5, 25]},
        'RF': {'n_estimators': [10, 100], 'max_depth': [5, 25],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [5, 10], 'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100]},
        'BAG': {'n_estimators': [10, 100]}}
# --- Temporal validation and export --------------------------------------
# Rolling 6-month train/test splits keyed on the posting date.
train_sets, test_sets, times = model.temporal_train_test_split(df_cleaned, 'date_posted', period='6M')
# The funding date would leak the label into the features; drop it from
# every split before training.
train_sets, test_sets = model.drop_time_col(train_sets, test_sets, 'datefullyfunded')

# NOTE(fix): clfs_loop_temporal was called unqualified (NameError); it is
# provided by the model module imported above -- TODO confirm.
results_df = model.clfs_loop_temporal(train_sets, test_sets, 'fully_funded', models, clfs, grid)

# One evaluation table per temporal split.
table_0 = results_df[0]
table_0
table_1 = results_df[1]
table_1
table_2 = results_df[2]
table_2

# Persist one sheet per split.
# NOTE(fix): ExcelWriter.save() was deprecated in pandas 1.5 and removed in
# 2.0; the context manager closes (and thereby saves) the workbook.
with pd.ExcelWriter('Model Evaluation.xlsx', engine='xlsxwriter') as writer:
    table_0.to_excel(writer, sheet_name='Sheet 0')
    table_1.to_excel(writer, sheet_name='Sheet 1')
    table_2.to_excel(writer, sheet_name='Sheet 2')
# --- Final report --------------------------------------------------------
# NOTE(fix): the original re-ran clfs_loop_temporal here unqualified (a
# NameError) and without the grid argument, clobbering results_df; the
# report only needs the tables already extracted above, so the redundant
# broken call is dropped.
import report

# Evaluation metrics to aggregate across splits: AUC-ROC plus precision at
# the listed population thresholds (1% ... 50%).
metrics = ['auc-roc', 'p_at_1', 'p_at_2', 'p_at_5', 'p_at_10', 'p_at_20', 'p_at_30', 'p_at_50']
tables = [table_0, table_1, table_2]
report.generate_whole_report(tables, metrics)